import pandas as pd
from scipy.stats import ttest_ind

# Numeric suffix selecting which input file to load.
# NOTE(review): what this value stands for is not visible in this script --
# confirm against the step that writes the CSV.
c = 1000

df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c))

# Sum of column `n` per speech (`gpo_id`). Only `n` is aggregated: summing
# every column first would also try to "sum" text/metadata columns, which is
# wasted work and can raise on columns that cannot be added.
narratives_by_speech = df.groupby('gpo_id', as_index=False)['n'].sum()

# Two candidate length measures of `sentence_raw`. The vectorised `.str`
# accessors propagate missing text as NaN instead of raising on it.
df['chars_sentence_raw'] = df['sentence_raw'].str.len()
# Splitting on a single literal space matches str.split(' '): consecutive
# spaces produce empty tokens, and those are counted as well.
df['words_sentence_raw'] = df['sentence_raw'].str.split(' ').str.len()

# Length measure used as the denominator below.
length_measure = 'words_sentence_raw'

# One row per speech: party and length come from the first row seen for each
# `gpo_id`, the narrative total comes from the aggregation above.
# NOTE(review): the length is taken from that first row only, while `n` is
# summed over all rows of the speech. This is only a speech-level length if
# `sentence_raw` is identical across rows of one `gpo_id` -- TODO confirm.
speech_meta = df.drop_duplicates(subset=['gpo_id'])[['gpo_id', 'party', length_measure]]
speeches = speech_meta.merge(narratives_by_speech, on='gpo_id', how='outer', indicator=True)

# Speeches without a party label cannot be assigned to either group.
speeches = speeches.dropna(subset=['party'])

# Narratives per unit of length (per word with the current length measure).
speeches['narratives_per_len'] = speeches['n'] / speeches[length_measure]

# Test whether narratives per unit of speech length differ between parties.
metric = 'narratives_per_len'
rep = speeches.loc[speeches['party'].eq('Republican')]
dem = speeches.loc[speeches['party'].eq('Democrat')]

# Two-sample t-test on the per-speech ratios (Republican vs. Democrat).
party_ttest = ttest_ind(rep[metric], dem[metric])
print(party_ttest)

# Inverse of the mean ratio: length units per single narrative, by party.
print('Len units underlying one narrative:')
for label, group in (('Democrats:', dem), ('Republicans:', rep)):
    print(label, round(1 / group[metric].mean(), 2))
